{"cells":[{"metadata":{"id":"07603013F1734332855A4D23AB115701","mdEditEnable":false},"cell_type":"markdown","source":"# 下面是简单的取平均做融合（需要先运行后面的blending部分，生成 first_zzp/result/sub_logestic_nn_allchunwenben.csv）"},{"outputs":[],"execution_count":2,"source":"import pandas as pd\nchunwenben=pd.read_csv(\"first_zzp/result/sub_logestic_nn_allchunwenben.csv\",header=None)[[2]]\nnn_ln=pd.read_pickle(\"/home/kesci/test_old_result/siamese_ln230000.pickle\")\nlgb=pd.read_pickle(\"/home/kesci/test_old_result/lgb_1200.pickle\")","cell_type":"code","metadata":{"trusted":true,"collapsed":false,"id":"027ADEC716034C6D8908FD9159D3DB6C","scrolled":false}},{"metadata":{"id":"462ACD7D32764B698E82A06DB1F88778","collapsed":false},"cell_type":"code","outputs":[],"source":"preds=(lgb[0]+nn_ln+chunwenben[2])/3","execution_count":null},{"metadata":{"id":"FAFD747834A6463F819CE14AED45A28E","mdEditEnable":false},"cell_type":"markdown","source":"# 下面是使用线性回归（LinearRegression）进行blending"},{"metadata":{"id":"86C5EE2B960A4895B6C6469E7D00AC9B","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"import pandas as pd\nesim=pd.read_pickle(\"/home/kesci/blending/esim_fea_chunwenben.pickle\")\nlcnn=pd.read_pickle(\"/home/kesci/blending/lcnn_fea_chunwenben.pickle\")\ncnn=pd.read_pickle(\"/home/kesci/blending/textcnn_fea_chunwenben.pickle\")\ntfidf=pd.read_pickle(\"/home/kesci/blending/textcnn_tfidf_fea_chunwenben.pickle\")\nsiamese=pd.read_pickle(\"/home/kesci/blending/siamese_fea_chunwenben.pickle\")\nrnn=pd.read_pickle(\"/home/kesci/blending/textcnn_rnn_fea_chunwenben.pickle\")\n# sia_have_ctr=pd.read_pickle(\"/home/kesci/blending/siamese_ln230000.pickle\")\n# sia_no_ctr=pd.read_pickle(\"/home/kesci/blending/siamese_fea_no_ctr.pickle\")\n# 
lgb=pd.read_pickle(\"/home/kesci/blending/lgb_1200.pickle\")\nlabel=pd.read_pickle(\"/home/kesci/blending/train_data5000w-6000w.pickle\")['label']","execution_count":1},{"metadata":{"id":"548557D4C29C47E5837E8A6136327D82","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"all_blending_train=pd.concat([esim,lcnn,cnn,tfidf,siamese,rnn],axis=1)","execution_count":2},{"metadata":{"id":"E03869323268431DAC13BD202748A1B6","collapsed":true,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"             esim      lcnn       cnn     tfidf   siamese       rnn\n0        0.183343  0.210926  0.178435  0.166411  0.041594  0.114207\n1        0.127543  0.170234  0.159299  0.193318  0.068709  0.084089\n2        0.256977  0.220685  0.304244  0.245783  0.189384  0.124110\n3        0.329264  0.252470  0.316229  0.294219  0.176245  0.180180\n4        0.293926  0.269850  0.330358  0.306480  0.137237  0.243230\n5        0.277756  0.234906  0.306567  0.254959  0.239885  0.185155\n6        0.290975  0.236100  0.287141  0.261831  0.225257  0.182645\n7        0.281233  0.220378  0.278347  0.246527  0.227422  0.155311\n8        0.073633  0.093988  0.102847  0.091150  0.037049  0.126172\n9        0.099157  0.087732  0.093161  0.092697  0.108357  0.091863\n10       0.061796  0.105573  0.143059  0.097542  0.069790  0.120047\n11       0.123377  0.140807  0.150359  0.128166  0.102603  0.160087\n12       0.293368  0.201807  0.293641  0.228811  0.204723  0.189470\n13       0.078154  0.131898  0.118266  0.170003  0.100291  0.111570\n14       0.398148  0.255469  0.377622  0.380209  0.403914  0.382533\n15       0.165693  0.080638  0.220998  0.142363  0.109738  0.101367\n16       0.328428  0.240529  0.361178  0.405237  0.330032  0.352683\n17       0.380304  0.273786  0.344722  0.362329  0.290256  0.350222\n18       0.349416  0.257773  0.317110  0.287808  0.229717  0.355011\n19       0.420683  0.249101  0.368600  
0.310864  0.286035  0.316092\n20       0.395232  0.238495  0.383211  0.401805  0.355147  0.359722\n21       0.063580  0.077431  0.116238  0.107871  0.057790  0.147502\n22       0.047393  0.027277  0.068281  0.041348  0.064397  0.060155\n23       0.108533  0.091941  0.134770  0.142344  0.057161  0.142447\n24       0.022149  0.028021  0.038434  0.055003  0.072808  0.085148\n25       0.063761  0.085595  0.101943  0.130718  0.026504  0.120189\n26       0.252082  0.250803  0.239114  0.243267  0.274929  0.194358\n27       0.111844  0.102552  0.111934  0.119268  0.056281  0.172390\n28       0.043412  0.074418  0.086375  0.084705  0.025406  0.127015\n29       0.033502  0.044718  0.080294  0.068978  0.030750  0.103222\n...           ...       ...       ...       ...       ...       ...\n9999970  0.165128  0.157426  0.159998  0.192329  0.211357  0.145061\n9999971  0.207487  0.181840  0.245273  0.166036  0.198161  0.185849\n9999972  0.165988  0.149326  0.175779  0.150465  0.196418  0.159729\n9999973  0.139762  0.222923  0.307985  0.183440  0.266340  0.181391\n9999974  0.215708  0.180050  0.178238  0.248254  0.186034  0.244965\n9999975  0.081802  0.115591  0.118820  0.108942  0.195108  0.175034\n9999976  0.186093  0.193749  0.389246  0.194351  0.275788  0.216140\n9999977  0.290551  0.201620  0.285007  0.314757  0.117171  0.258349\n9999978  0.166379  0.162729  0.227634  0.201888  0.082588  0.182384\n9999979  0.070790  0.069349  0.106155  0.111007  0.051354  0.147986\n9999980  0.034841  0.029093  0.030690  0.041065  0.014371  0.020332\n9999981  0.122875  0.151999  0.180923  0.184422  0.186763  0.167568\n9999982  0.163226  0.206355  0.213456  0.298012  0.154435  0.302324\n9999983  0.076904  0.075969  0.060056  0.106338  0.070358  0.116356\n9999984  0.370937  0.293974  0.369704  0.378480  0.271502  0.333590\n9999985  0.261797  0.218071  0.206922  0.329579  0.266202  0.196525\n9999986  0.343210  0.266212  0.248553  0.236931  0.233802  0.273509\n9999987  0.353125  0.252692  0.301002 
 0.322517  0.236588  0.182779\n9999988  0.276885  0.201802  0.225492  0.244956  0.188742  0.135402\n9999989  0.287292  0.227126  0.274997  0.305922  0.196734  0.181266\n9999990  0.069087  0.082701  0.126623  0.107481  0.045689  0.122898\n9999991  0.268567  0.191218  0.264862  0.264924  0.207662  0.269034\n9999992  0.130335  0.190253  0.217734  0.207759  0.218057  0.216466\n9999993  0.264062  0.210552  0.287227  0.312371  0.261119  0.309268\n9999994  0.216972  0.238658  0.400518  0.279897  0.237453  0.289573\n9999995  0.160229  0.262126  0.279217  0.223468  0.307646  0.356853\n9999996  0.197098  0.245487  0.281672  0.224826  0.262286  0.237049\n9999997  0.336345  0.288883  0.397440  0.365541  0.385385  0.345675\n9999998  0.077009  0.111480  0.237325  0.151835  0.055440  0.180989\n9999999  0.245661  0.186862  0.353266  0.165128  0.187926  0.213783\n\n[10000000 rows x 6 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>esim</th>\n      <th>lcnn</th>\n      <th>cnn</th>\n      <th>tfidf</th>\n      <th>siamese</th>\n      <th>rnn</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.183343</td>\n      <td>0.210926</td>\n      <td>0.178435</td>\n      <td>0.166411</td>\n      <td>0.041594</td>\n      <td>0.114207</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.127543</td>\n      <td>0.170234</td>\n      <td>0.159299</td>\n      <td>0.193318</td>\n      <td>0.068709</td>\n      <td>0.084089</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.256977</td>\n      <td>0.220685</td>\n      <td>0.304244</td>\n      <td>0.245783</td>\n      <td>0.189384</td>\n      <td>0.124110</td>\n    </tr>\n    
<tr>\n      <th>3</th>\n      <td>0.329264</td>\n      <td>0.252470</td>\n      <td>0.316229</td>\n      <td>0.294219</td>\n      <td>0.176245</td>\n      <td>0.180180</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.293926</td>\n      <td>0.269850</td>\n      <td>0.330358</td>\n      <td>0.306480</td>\n      <td>0.137237</td>\n      <td>0.243230</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>0.277756</td>\n      <td>0.234906</td>\n      <td>0.306567</td>\n      <td>0.254959</td>\n      <td>0.239885</td>\n      <td>0.185155</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>0.290975</td>\n      <td>0.236100</td>\n      <td>0.287141</td>\n      <td>0.261831</td>\n      <td>0.225257</td>\n      <td>0.182645</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>0.281233</td>\n      <td>0.220378</td>\n      <td>0.278347</td>\n      <td>0.246527</td>\n      <td>0.227422</td>\n      <td>0.155311</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>0.073633</td>\n      <td>0.093988</td>\n      <td>0.102847</td>\n      <td>0.091150</td>\n      <td>0.037049</td>\n      <td>0.126172</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>0.099157</td>\n      <td>0.087732</td>\n      <td>0.093161</td>\n      <td>0.092697</td>\n      <td>0.108357</td>\n      <td>0.091863</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>0.061796</td>\n      <td>0.105573</td>\n      <td>0.143059</td>\n      <td>0.097542</td>\n      <td>0.069790</td>\n      <td>0.120047</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>0.123377</td>\n      <td>0.140807</td>\n      <td>0.150359</td>\n      <td>0.128166</td>\n      <td>0.102603</td>\n      <td>0.160087</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>0.293368</td>\n      <td>0.201807</td>\n      <td>0.293641</td>\n      <td>0.228811</td>\n      <td>0.204723</td>\n      <td>0.189470</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>0.078154</td>\n      <td>0.131898</td>\n      <td>0.118266</td>\n      
<td>0.170003</td>\n      <td>0.100291</td>\n      <td>0.111570</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>0.398148</td>\n      <td>0.255469</td>\n      <td>0.377622</td>\n      <td>0.380209</td>\n      <td>0.403914</td>\n      <td>0.382533</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>0.165693</td>\n      <td>0.080638</td>\n      <td>0.220998</td>\n      <td>0.142363</td>\n      <td>0.109738</td>\n      <td>0.101367</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>0.328428</td>\n      <td>0.240529</td>\n      <td>0.361178</td>\n      <td>0.405237</td>\n      <td>0.330032</td>\n      <td>0.352683</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>0.380304</td>\n      <td>0.273786</td>\n      <td>0.344722</td>\n      <td>0.362329</td>\n      <td>0.290256</td>\n      <td>0.350222</td>\n    </tr>\n    <tr>\n      <th>18</th>\n      <td>0.349416</td>\n      <td>0.257773</td>\n      <td>0.317110</td>\n      <td>0.287808</td>\n      <td>0.229717</td>\n      <td>0.355011</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>0.420683</td>\n      <td>0.249101</td>\n      <td>0.368600</td>\n      <td>0.310864</td>\n      <td>0.286035</td>\n      <td>0.316092</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>0.395232</td>\n      <td>0.238495</td>\n      <td>0.383211</td>\n      <td>0.401805</td>\n      <td>0.355147</td>\n      <td>0.359722</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>0.063580</td>\n      <td>0.077431</td>\n      <td>0.116238</td>\n      <td>0.107871</td>\n      <td>0.057790</td>\n      <td>0.147502</td>\n    </tr>\n    <tr>\n      <th>22</th>\n      <td>0.047393</td>\n      <td>0.027277</td>\n      <td>0.068281</td>\n      <td>0.041348</td>\n      <td>0.064397</td>\n      <td>0.060155</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>0.108533</td>\n      <td>0.091941</td>\n      <td>0.134770</td>\n      <td>0.142344</td>\n      <td>0.057161</td>\n      <td>0.142447</td>\n    </tr>\n    <tr>\n      
<th>24</th>\n      <td>0.022149</td>\n      <td>0.028021</td>\n      <td>0.038434</td>\n      <td>0.055003</td>\n      <td>0.072808</td>\n      <td>0.085148</td>\n    </tr>\n    <tr>\n      <th>25</th>\n      <td>0.063761</td>\n      <td>0.085595</td>\n      <td>0.101943</td>\n      <td>0.130718</td>\n      <td>0.026504</td>\n      <td>0.120189</td>\n    </tr>\n    <tr>\n      <th>26</th>\n      <td>0.252082</td>\n      <td>0.250803</td>\n      <td>0.239114</td>\n      <td>0.243267</td>\n      <td>0.274929</td>\n      <td>0.194358</td>\n    </tr>\n    <tr>\n      <th>27</th>\n      <td>0.111844</td>\n      <td>0.102552</td>\n      <td>0.111934</td>\n      <td>0.119268</td>\n      <td>0.056281</td>\n      <td>0.172390</td>\n    </tr>\n    <tr>\n      <th>28</th>\n      <td>0.043412</td>\n      <td>0.074418</td>\n      <td>0.086375</td>\n      <td>0.084705</td>\n      <td>0.025406</td>\n      <td>0.127015</td>\n    </tr>\n    <tr>\n      <th>29</th>\n      <td>0.033502</td>\n      <td>0.044718</td>\n      <td>0.080294</td>\n      <td>0.068978</td>\n      <td>0.030750</td>\n      <td>0.103222</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>9999970</th>\n      <td>0.165128</td>\n      <td>0.157426</td>\n      <td>0.159998</td>\n      <td>0.192329</td>\n      <td>0.211357</td>\n      <td>0.145061</td>\n    </tr>\n    <tr>\n      <th>9999971</th>\n      <td>0.207487</td>\n      <td>0.181840</td>\n      <td>0.245273</td>\n      <td>0.166036</td>\n      <td>0.198161</td>\n      <td>0.185849</td>\n    </tr>\n    <tr>\n      <th>9999972</th>\n      <td>0.165988</td>\n      <td>0.149326</td>\n      <td>0.175779</td>\n      <td>0.150465</td>\n      <td>0.196418</td>\n      <td>0.159729</td>\n    </tr>\n    <tr>\n      <th>9999973</th>\n      <td>0.139762</td>\n      <td>0.222923</td>\n      <td>0.307985</td>\n      
<td>0.183440</td>\n      <td>0.266340</td>\n      <td>0.181391</td>\n    </tr>\n    <tr>\n      <th>9999974</th>\n      <td>0.215708</td>\n      <td>0.180050</td>\n      <td>0.178238</td>\n      <td>0.248254</td>\n      <td>0.186034</td>\n      <td>0.244965</td>\n    </tr>\n    <tr>\n      <th>9999975</th>\n      <td>0.081802</td>\n      <td>0.115591</td>\n      <td>0.118820</td>\n      <td>0.108942</td>\n      <td>0.195108</td>\n      <td>0.175034</td>\n    </tr>\n    <tr>\n      <th>9999976</th>\n      <td>0.186093</td>\n      <td>0.193749</td>\n      <td>0.389246</td>\n      <td>0.194351</td>\n      <td>0.275788</td>\n      <td>0.216140</td>\n    </tr>\n    <tr>\n      <th>9999977</th>\n      <td>0.290551</td>\n      <td>0.201620</td>\n      <td>0.285007</td>\n      <td>0.314757</td>\n      <td>0.117171</td>\n      <td>0.258349</td>\n    </tr>\n    <tr>\n      <th>9999978</th>\n      <td>0.166379</td>\n      <td>0.162729</td>\n      <td>0.227634</td>\n      <td>0.201888</td>\n      <td>0.082588</td>\n      <td>0.182384</td>\n    </tr>\n    <tr>\n      <th>9999979</th>\n      <td>0.070790</td>\n      <td>0.069349</td>\n      <td>0.106155</td>\n      <td>0.111007</td>\n      <td>0.051354</td>\n      <td>0.147986</td>\n    </tr>\n    <tr>\n      <th>9999980</th>\n      <td>0.034841</td>\n      <td>0.029093</td>\n      <td>0.030690</td>\n      <td>0.041065</td>\n      <td>0.014371</td>\n      <td>0.020332</td>\n    </tr>\n    <tr>\n      <th>9999981</th>\n      <td>0.122875</td>\n      <td>0.151999</td>\n      <td>0.180923</td>\n      <td>0.184422</td>\n      <td>0.186763</td>\n      <td>0.167568</td>\n    </tr>\n    <tr>\n      <th>9999982</th>\n      <td>0.163226</td>\n      <td>0.206355</td>\n      <td>0.213456</td>\n      <td>0.298012</td>\n      <td>0.154435</td>\n      <td>0.302324</td>\n    </tr>\n    <tr>\n      <th>9999983</th>\n      <td>0.076904</td>\n      <td>0.075969</td>\n      <td>0.060056</td>\n      <td>0.106338</td>\n      <td>0.070358</td>\n      
<td>0.116356</td>\n    </tr>\n    <tr>\n      <th>9999984</th>\n      <td>0.370937</td>\n      <td>0.293974</td>\n      <td>0.369704</td>\n      <td>0.378480</td>\n      <td>0.271502</td>\n      <td>0.333590</td>\n    </tr>\n    <tr>\n      <th>9999985</th>\n      <td>0.261797</td>\n      <td>0.218071</td>\n      <td>0.206922</td>\n      <td>0.329579</td>\n      <td>0.266202</td>\n      <td>0.196525</td>\n    </tr>\n    <tr>\n      <th>9999986</th>\n      <td>0.343210</td>\n      <td>0.266212</td>\n      <td>0.248553</td>\n      <td>0.236931</td>\n      <td>0.233802</td>\n      <td>0.273509</td>\n    </tr>\n    <tr>\n      <th>9999987</th>\n      <td>0.353125</td>\n      <td>0.252692</td>\n      <td>0.301002</td>\n      <td>0.322517</td>\n      <td>0.236588</td>\n      <td>0.182779</td>\n    </tr>\n    <tr>\n      <th>9999988</th>\n      <td>0.276885</td>\n      <td>0.201802</td>\n      <td>0.225492</td>\n      <td>0.244956</td>\n      <td>0.188742</td>\n      <td>0.135402</td>\n    </tr>\n    <tr>\n      <th>9999989</th>\n      <td>0.287292</td>\n      <td>0.227126</td>\n      <td>0.274997</td>\n      <td>0.305922</td>\n      <td>0.196734</td>\n      <td>0.181266</td>\n    </tr>\n    <tr>\n      <th>9999990</th>\n      <td>0.069087</td>\n      <td>0.082701</td>\n      <td>0.126623</td>\n      <td>0.107481</td>\n      <td>0.045689</td>\n      <td>0.122898</td>\n    </tr>\n    <tr>\n      <th>9999991</th>\n      <td>0.268567</td>\n      <td>0.191218</td>\n      <td>0.264862</td>\n      <td>0.264924</td>\n      <td>0.207662</td>\n      <td>0.269034</td>\n    </tr>\n    <tr>\n      <th>9999992</th>\n      <td>0.130335</td>\n      <td>0.190253</td>\n      <td>0.217734</td>\n      <td>0.207759</td>\n      <td>0.218057</td>\n      <td>0.216466</td>\n    </tr>\n    <tr>\n      <th>9999993</th>\n      <td>0.264062</td>\n      <td>0.210552</td>\n      <td>0.287227</td>\n      <td>0.312371</td>\n      <td>0.261119</td>\n      <td>0.309268</td>\n    </tr>\n    <tr>\n      
<th>9999994</th>\n      <td>0.216972</td>\n      <td>0.238658</td>\n      <td>0.400518</td>\n      <td>0.279897</td>\n      <td>0.237453</td>\n      <td>0.289573</td>\n    </tr>\n    <tr>\n      <th>9999995</th>\n      <td>0.160229</td>\n      <td>0.262126</td>\n      <td>0.279217</td>\n      <td>0.223468</td>\n      <td>0.307646</td>\n      <td>0.356853</td>\n    </tr>\n    <tr>\n      <th>9999996</th>\n      <td>0.197098</td>\n      <td>0.245487</td>\n      <td>0.281672</td>\n      <td>0.224826</td>\n      <td>0.262286</td>\n      <td>0.237049</td>\n    </tr>\n    <tr>\n      <th>9999997</th>\n      <td>0.336345</td>\n      <td>0.288883</td>\n      <td>0.397440</td>\n      <td>0.365541</td>\n      <td>0.385385</td>\n      <td>0.345675</td>\n    </tr>\n    <tr>\n      <th>9999998</th>\n      <td>0.077009</td>\n      <td>0.111480</td>\n      <td>0.237325</td>\n      <td>0.151835</td>\n      <td>0.055440</td>\n      <td>0.180989</td>\n    </tr>\n    <tr>\n      <th>9999999</th>\n      <td>0.245661</td>\n      <td>0.186862</td>\n      <td>0.353266</td>\n      <td>0.165128</td>\n      <td>0.187926</td>\n      <td>0.213783</td>\n    </tr>\n  </tbody>\n</table>\n<p>10000000 rows × 6 columns</p>\n</div>"},"execution_count":3}],"source":"all_blending_train.columns=['esim','lcnn','cnn','tfidf','siamese','rnn']\nall_blending_train","execution_count":3},{"metadata":{"id":"95710FD253174289B1F3B384AD3BA4F4","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n         normalize=False)"},"execution_count":4}],"source":"import pandas as pd\r\nimport os\r\nfrom sklearn.model_selection import train_test_split\r\nfrom sklearn import metrics\r\nimport numpy as np\r\nfrom sklearn.linear_model import LinearRegression\r\n\r\nlinreg = LinearRegression()\r\nlinreg.fit(all_blending_train, 
label)","execution_count":4},{"metadata":{"id":"FD18C0F0C2A64D5A91BD6555C91D0D12","mdEditEnable":false},"cell_type":"markdown","source":"# 打印回归系数（截距和各模型的权重）"},{"metadata":{"id":"C69B45BE567D4928957ED99DE54137FD","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"stream","text":"-0.018435568\n[0.10604735 0.02851146 0.25581202 0.22829965 0.00853581 0.15597415\n 0.32563168]\n","name":"stdout"}],"source":"print (linreg.intercept_)\r\nprint (linreg.coef_)","execution_count":67},{"metadata":{"id":"C8C56000E50340C6BD4AB7039AC34867","mdEditEnable":false},"cell_type":"markdown","source":"# 加载测试集"},{"metadata":{"id":"5C72CF15202A45BBB5A3DC1D3F094F7F","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"import pandas as pd\n# esim_test=pd.read_pickle(\"/home/kesci/test_old_result/esim_fea_chunwenben.pickle\")\n# lcnn_test=pd.read_pickle(\"/home/kesci/test_old_result/lcnn_fea_chunwenben.pickle\")\n# cnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_fea_chunwenben.pickle\")\n# tfidf_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_tfidf_fea_chunwenben.pickle\")\n# siamese_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_fea_chunwenben.pickle\")\n# rnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_rnn_fea_chunwenben.pickle\")\n\nesim_test=pd.read_pickle(\"/home/kesci/test_old_result/esim_fea_chunwenben.pickle\")\nlcnn_test=pd.read_pickle(\"/home/kesci/test_old_result/lcnn_fea_chunwenben.pickle\")\ncnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_fea_chunwenben.pickle\")\ntfidf_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_tfidf_fea_chunwenben.pickle\")\nsiamese_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_fea_chunwenben.pickle\")\nrnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_rnn_fea_chunwenben.pickle\")\n# 
sia_have_ctr_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_ln230000.pickle\")\nsia_no_ctr_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_fea_no_ctr.pickle\")\n# lgb_test=pd.read_pickle(\"/home/kesci/test_old_result/lgb_1200.pickle\")\n","execution_count":68},{"metadata":{"id":"8684F61E50CE4F55885EA57629A108B1","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"all_blending_test=pd.concat([esim_test,lcnn_test,cnn_test,tfidf_test,siamese_test,rnn_test,sia_no_ctr_test],axis=1)\nall_blending_test.columns=['esim','lcnn','cnn','tfidf','siamese','rnn','sia_no_ctr']","execution_count":69},{"metadata":{"id":"924A7799CB12429081C929D8B5A6D4DE","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"preds = linreg.predict(all_blending_test)","execution_count":70},{"metadata":{"id":"A372C676A0D34A83B9FDD1888D65F659","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"0.18898095689758024"},"execution_count":19}],"source":"preds.mean()","execution_count":19},{"metadata":{"id":"40844906499641BE9F7DD73E1B629E87","mdEditEnable":false},"cell_type":"markdown","source":"# 下面是提交预测结果"},{"metadata":{"id":"38165C0F339548FD92A398200A92E30C","collapsed":true,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"                 0\n0         0.090642\n1         0.154695\n2         0.124515\n3         0.181585\n4         0.083189\n5         0.062321\n6         0.147699\n7         0.308451\n8         0.265529\n9         0.182372\n10        0.161379\n11        0.360521\n12        0.264380\n13        0.346459\n14        0.345655\n15        0.291782\n16        0.359122\n17        0.227888\n18        0.292287\n19        0.326756\n20        0.123003\n21        0.065921\n22        0.195623\n23        0.294173\n24        0.272112\n25        0.565019\n26        0.391905\n27        
0.335000\n28        0.235103\n29        0.273844\n...            ...\n19999970  0.316614\n19999971  0.263674\n19999972  0.197587\n19999973  0.142799\n19999974  0.128415\n19999975  0.158825\n19999976  0.236127\n19999977  0.178507\n19999978  0.157281\n19999979  0.191018\n19999980  0.140344\n19999981  0.155093\n19999982  0.225528\n19999983  0.210799\n19999984  0.222960\n19999985  0.107509\n19999986  0.195245\n19999987  0.118376\n19999988  0.193199\n19999989  0.222258\n19999990  0.220496\n19999991  0.131090\n19999992  0.133240\n19999993  0.112392\n19999994  0.197419\n19999995  0.245495\n19999996  0.181448\n19999997  0.213272\n19999998  0.184392\n19999999  0.210809\n\n[20000000 rows x 1 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.090642</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.154695</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.124515</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.181585</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.083189</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>0.062321</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>0.147699</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>0.308451</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>0.265529</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>0.182372</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>0.161379</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>0.360521</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>0.264380</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      
<td>0.346459</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>0.345655</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>0.291782</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>0.359122</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>0.227888</td>\n    </tr>\n    <tr>\n      <th>18</th>\n      <td>0.292287</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>0.326756</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>0.123003</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>0.065921</td>\n    </tr>\n    <tr>\n      <th>22</th>\n      <td>0.195623</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>0.294173</td>\n    </tr>\n    <tr>\n      <th>24</th>\n      <td>0.272112</td>\n    </tr>\n    <tr>\n      <th>25</th>\n      <td>0.565019</td>\n    </tr>\n    <tr>\n      <th>26</th>\n      <td>0.391905</td>\n    </tr>\n    <tr>\n      <th>27</th>\n      <td>0.335000</td>\n    </tr>\n    <tr>\n      <th>28</th>\n      <td>0.235103</td>\n    </tr>\n    <tr>\n      <th>29</th>\n      <td>0.273844</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>19999970</th>\n      <td>0.316614</td>\n    </tr>\n    <tr>\n      <th>19999971</th>\n      <td>0.263674</td>\n    </tr>\n    <tr>\n      <th>19999972</th>\n      <td>0.197587</td>\n    </tr>\n    <tr>\n      <th>19999973</th>\n      <td>0.142799</td>\n    </tr>\n    <tr>\n      <th>19999974</th>\n      <td>0.128415</td>\n    </tr>\n    <tr>\n      <th>19999975</th>\n      <td>0.158825</td>\n    </tr>\n    <tr>\n      <th>19999976</th>\n      <td>0.236127</td>\n    </tr>\n    <tr>\n      <th>19999977</th>\n      <td>0.178507</td>\n    </tr>\n    <tr>\n      <th>19999978</th>\n      <td>0.157281</td>\n    </tr>\n    <tr>\n      <th>19999979</th>\n      <td>0.191018</td>\n    </tr>\n    <tr>\n      <th>19999980</th>\n      <td>0.140344</td>\n    </tr>\n    <tr>\n      <th>19999981</th>\n      <td>0.155093</td>\n    </tr>\n    <tr>\n      
<th>19999982</th>\n      <td>0.225528</td>\n    </tr>\n    <tr>\n      <th>19999983</th>\n      <td>0.210799</td>\n    </tr>\n    <tr>\n      <th>19999984</th>\n      <td>0.222960</td>\n    </tr>\n    <tr>\n      <th>19999985</th>\n      <td>0.107509</td>\n    </tr>\n    <tr>\n      <th>19999986</th>\n      <td>0.195245</td>\n    </tr>\n    <tr>\n      <th>19999987</th>\n      <td>0.118376</td>\n    </tr>\n    <tr>\n      <th>19999988</th>\n      <td>0.193199</td>\n    </tr>\n    <tr>\n      <th>19999989</th>\n      <td>0.222258</td>\n    </tr>\n    <tr>\n      <th>19999990</th>\n      <td>0.220496</td>\n    </tr>\n    <tr>\n      <th>19999991</th>\n      <td>0.131090</td>\n    </tr>\n    <tr>\n      <th>19999992</th>\n      <td>0.133240</td>\n    </tr>\n    <tr>\n      <th>19999993</th>\n      <td>0.112392</td>\n    </tr>\n    <tr>\n      <th>19999994</th>\n      <td>0.197419</td>\n    </tr>\n    <tr>\n      <th>19999995</th>\n      <td>0.245495</td>\n    </tr>\n    <tr>\n      <th>19999996</th>\n      <td>0.181448</td>\n    </tr>\n    <tr>\n      <th>19999997</th>\n      <td>0.213272</td>\n    </tr>\n    <tr>\n      <th>19999998</th>\n      <td>0.184392</td>\n    </tr>\n    <tr>\n      <th>19999999</th>\n      <td>0.210809</td>\n    </tr>\n  </tbody>\n</table>\n<p>20000000 rows × 1 columns</p>\n</div>"},"execution_count":21}],"source":"preds=pd.DataFrame(preds)\npreds","execution_count":21},{"metadata":{"id":"684610B47BD2496693F83CBBAF7CCD76","collapsed":true,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"stream","text":"0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n9.903692245483398\n0.0001571178436279297\n","name":"stdout"}],"source":"data_r_path='/home/kesci/input/bytedance/test_final_part1.csv'\r\ncpu_num=16\r\nstart=0\r\nall_data_num=20000000\r\nnames_out=['query_id','query','query_title_id','title']\r\nusecols_out=['query_id','query_title_id']\r\nhas_head=False\r\nif has_head==False:\r\n    import multiprocessing\r\n    from time 
import time\r\n    import pandas as pd\r\n    all_data=[]\r\n    #test_final_part1.csv\r\n    def data_read(start,single_data_num,data_real_path):\r\n        data_out=pd.read_csv(data_real_path,header=None,names=names_out,usecols=usecols_out,skiprows=start,nrows=single_data_num)\r\n        return data_out\r\n    time1=time()\r\n    pool = multiprocessing.Pool(processes=cpu_num)\r\n    \r\n    for epoch in range(int(cpu_num)):\r\n        print(epoch)\r\n        single_data_num=int(all_data_num/cpu_num)\r\n        all_data.append(pool.apply_async(data_read, [start+single_data_num*epoch,single_data_num,data_r_path]))\r\n    # single_data_num=20000000/16\r\n    # all_data_num=20000000\r\n    # for epoch in range(int(all_data_num/single_data_num)):\r\n    #     all_data.append(pool.apply_async(data_read, [single_data_num*epoch,single_data_num,data_path+\"test_final_part1.csv\"]))\r\n    pool.close()\r\n    pool.join()\r\n    time2=time()\r\n    print(time2-time1)\r\n    all_data_pro=[single.get() for single in all_data]\r\n    time3=time()\r\n    print(time3-time2)\r\n    all_data_pro=pd.concat(all_data_pro)\r\n    test_data_pred_need=all_data_pro.reset_index(drop=True)\r\n\r\n\r\nresult=pd.concat([test_data_pred_need,preds],axis=1)\r\nresult.to_csv(\"first_zzp/result/sub_logestic_nn_allchunwenben.csv\",header=None,index=None)#####合成三列做 
最终的提交结果","execution_count":22},{"metadata":{"id":"96E17AF399A54BA7A3DBA7CC7A277307","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"result.to_csv(\"first_zzp/result/sub_logestic_nn_allchunwenben_lgb_sia.csv\",header=None,index=None)","execution_count":14},{"metadata":{"id":"FDD5A534DD25448A89D5E019CFF4451D","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"lgb=pd.read_csv(\"/home/kesci/work/first_zzp/result/sub_new_lsi.csv\",header=None)\nresult[0]=(result[0]+lgb[2])/2","execution_count":13},{"metadata":{"id":"1FA468286EFA4227925268B0FAE57C08","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"stream","text":"wget: /opt/conda/lib/libcrypto.so.1.0.0: no version information available (required by wget)\nwget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)\nwget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)\n--2019-08-10 17:59:26--  https://www.heywhale.com/kesci_submit\nResolving www.heywhale.com (www.heywhale.com)... 106.15.25.147\nConnecting to www.heywhale.com (www.heywhale.com)|106.15.25.147|:443... connected.\nHTTP request sent, awaiting response... 
200 OK\nLength: 6709558 (6.4M) [application/octet-stream]\nSaving to: ‘kesci_submit’\n\nkesci_submit        100%[===================>]   6.40M  18.7MB/s    in 0.3s    \n\n2019-08-10 17:59:26 (18.7 MB/s) - ‘kesci_submit’ saved [6709558/6709558]\n\nKesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 /home/kesci/work/first_zzp/result/sub_logestic_nn_allchunwenben.csv (575854.49 KiB)\n> 已上传 100 %\n> 文件已上传        \n> 服务器响应: 200 提交成功，请等待评审完成\n> 提交完成\n","name":"stdout"}],"source":"!wget -O kesci_submit https://www.heywhale.com/kesci_submit&&chmod +x kesci_submit\r\n!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file /home/kesci/work/first_zzp/result/sub_logestic_nn_allchunwenben.csv -token 02ada54c9760d3e1","execution_count":23},{"metadata":{"id":"68E6C00A005B4A109FCD000238E13AEF","mdEditEnable":false},"cell_type":"markdown","source":"# 以下是用lgb做blending"},{"metadata":{"id":"1913C1C465F740A68116360113B5D8FA","collapsed":false,"scrolled":true},"cell_type":"code","outputs":[{"output_type":"stream","text":"['esim', 'lcnn', 'cnn', 'tfidf', 'siamese', 'rnn']\nstart2\n","name":"stdout"},{"output_type":"stream","text":"/opt/conda/lib/python3.6/site-packages/lightgbm/engine.py:147: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n  warnings.warn(\"Found `{}` in params. 
Will use it instead of argument\".format(alias))\n/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py:755: UserWarning: silent keyword has been found in `params` and will be ignored.\nPlease use silent argument of the Dataset constructor to pass this parameter.\n  .format(key))\n","name":"stderr"},{"output_type":"stream","text":"[2]\tvalid_0's binary_logloss: 0.470343\n[4]\tvalid_0's binary_logloss: 0.469461\n[6]\tvalid_0's binary_logloss: 0.468591\n[8]\tvalid_0's binary_logloss: 0.467756\n[10]\tvalid_0's binary_logloss: 0.466952\n[12]\tvalid_0's binary_logloss: 0.466171\n[14]\tvalid_0's binary_logloss: 0.465425\n[16]\tvalid_0's binary_logloss: 0.46471\n[18]\tvalid_0's binary_logloss: 0.464027\n[20]\tvalid_0's binary_logloss: 0.463368\n[22]\tvalid_0's binary_logloss: 0.462727\n[24]\tvalid_0's binary_logloss: 0.462116\n[26]\tvalid_0's binary_logloss: 0.461541\n[28]\tvalid_0's binary_logloss: 0.460974\n[30]\tvalid_0's binary_logloss: 0.460427\n[32]\tvalid_0's binary_logloss: 0.459909\n[34]\tvalid_0's binary_logloss: 0.459406\n[36]\tvalid_0's binary_logloss: 0.458911\n[38]\tvalid_0's binary_logloss: 0.458434\n[40]\tvalid_0's binary_logloss: 0.457982\n[42]\tvalid_0's binary_logloss: 0.457546\n[44]\tvalid_0's binary_logloss: 0.457119\n[46]\tvalid_0's binary_logloss: 0.456713\n[48]\tvalid_0's binary_logloss: 0.456323\n[50]\tvalid_0's binary_logloss: 0.455947\n[52]\tvalid_0's binary_logloss: 0.455587\n[54]\tvalid_0's binary_logloss: 0.455235\n[56]\tvalid_0's binary_logloss: 0.454898\n[58]\tvalid_0's binary_logloss: 0.454569\n[60]\tvalid_0's binary_logloss: 0.454252\n[62]\tvalid_0's binary_logloss: 0.453947\n[64]\tvalid_0's binary_logloss: 0.453651\n[66]\tvalid_0's binary_logloss: 0.453367\n[68]\tvalid_0's binary_logloss: 0.453095\n[70]\tvalid_0's binary_logloss: 0.452834\n[72]\tvalid_0's binary_logloss: 0.452581\n[74]\tvalid_0's binary_logloss: 0.452334\n[76]\tvalid_0's binary_logloss: 0.452098\n[78]\tvalid_0's binary_logloss: 0.45187\n[80]\tvalid_0's binary_logloss: 
0.451646\n[82]\tvalid_0's binary_logloss: 0.451434\n[84]\tvalid_0's binary_logloss: 0.451227\n[86]\tvalid_0's binary_logloss: 0.451025\n[88]\tvalid_0's binary_logloss: 0.45083\n[90]\tvalid_0's binary_logloss: 0.450643\n[92]\tvalid_0's binary_logloss: 0.450463\n[94]\tvalid_0's binary_logloss: 0.450287\n[96]\tvalid_0's binary_logloss: 0.450118\n[98]\tvalid_0's binary_logloss: 0.449955\n[100]\tvalid_0's binary_logloss: 0.449796\n[102]\tvalid_0's binary_logloss: 0.449644\n[104]\tvalid_0's binary_logloss: 0.4495\n[106]\tvalid_0's binary_logloss: 0.449358\n[108]\tvalid_0's binary_logloss: 0.44922\n[110]\tvalid_0's binary_logloss: 0.449088\n[112]\tvalid_0's binary_logloss: 0.44896\n[114]\tvalid_0's binary_logloss: 0.448838\n[116]\tvalid_0's binary_logloss: 0.448718\n[118]\tvalid_0's binary_logloss: 0.448602\n[120]\tvalid_0's binary_logloss: 0.44849\n[122]\tvalid_0's binary_logloss: 0.448383\n[124]\tvalid_0's binary_logloss: 0.44828\n[126]\tvalid_0's binary_logloss: 0.448179\n[128]\tvalid_0's binary_logloss: 0.448082\n[130]\tvalid_0's binary_logloss: 0.447988\n[132]\tvalid_0's binary_logloss: 0.447898\n[134]\tvalid_0's binary_logloss: 0.447811\n[136]\tvalid_0's binary_logloss: 0.447725\n[138]\tvalid_0's binary_logloss: 0.447642\n[140]\tvalid_0's binary_logloss: 0.447564\n[142]\tvalid_0's binary_logloss: 0.447488\n[144]\tvalid_0's binary_logloss: 0.447414\n[146]\tvalid_0's binary_logloss: 0.447342\n[148]\tvalid_0's binary_logloss: 0.447273\n[150]\tvalid_0's binary_logloss: 0.447207\n[152]\tvalid_0's binary_logloss: 0.447142\n[154]\tvalid_0's binary_logloss: 0.44708\n[156]\tvalid_0's binary_logloss: 0.447021\n[158]\tvalid_0's binary_logloss: 0.446963\n[160]\tvalid_0's binary_logloss: 0.446908\n[162]\tvalid_0's binary_logloss: 0.446853\n[164]\tvalid_0's binary_logloss: 0.446801\n[166]\tvalid_0's binary_logloss: 0.44675\n[168]\tvalid_0's binary_logloss: 0.446703\n[170]\tvalid_0's binary_logloss: 0.446655\n[172]\tvalid_0's binary_logloss: 0.446609\n[174]\tvalid_0's 
binary_logloss: 0.446565\n[176]\tvalid_0's binary_logloss: 0.446523\n[178]\tvalid_0's binary_logloss: 0.446481\n[180]\tvalid_0's binary_logloss: 0.446442\n[182]\tvalid_0's binary_logloss: 0.446403\n[184]\tvalid_0's binary_logloss: 0.446366\n[186]\tvalid_0's binary_logloss: 0.44633\n[188]\tvalid_0's binary_logloss: 0.446295\n[190]\tvalid_0's binary_logloss: 0.446262\n[192]\tvalid_0's binary_logloss: 0.446228\n[194]\tvalid_0's binary_logloss: 0.446197\n[196]\tvalid_0's binary_logloss: 0.446167\n[198]\tvalid_0's binary_logloss: 0.446137\n[200]\tvalid_0's binary_logloss: 0.446108\n[202]\tvalid_0's binary_logloss: 0.446081\n[204]\tvalid_0's binary_logloss: 0.446055\n[206]\tvalid_0's binary_logloss: 0.446029\n[208]\tvalid_0's binary_logloss: 0.446004\n[210]\tvalid_0's binary_logloss: 0.44598\n[212]\tvalid_0's binary_logloss: 0.445957\n[214]\tvalid_0's binary_logloss: 0.445934\n[216]\tvalid_0's binary_logloss: 0.445913\n[218]\tvalid_0's binary_logloss: 0.445891\n[220]\tvalid_0's binary_logloss: 0.445871\n[222]\tvalid_0's binary_logloss: 0.445852\n[224]\tvalid_0's binary_logloss: 0.445833\n[226]\tvalid_0's binary_logloss: 0.445815\n[228]\tvalid_0's binary_logloss: 0.445798\n[230]\tvalid_0's binary_logloss: 0.445781\n[232]\tvalid_0's binary_logloss: 0.445765\n[234]\tvalid_0's binary_logloss: 0.445749\n[236]\tvalid_0's binary_logloss: 0.445734\n[238]\tvalid_0's binary_logloss: 0.44572\n[240]\tvalid_0's binary_logloss: 0.445705\n[242]\tvalid_0's binary_logloss: 0.445691\n[244]\tvalid_0's binary_logloss: 0.445677\n[246]\tvalid_0's binary_logloss: 0.445665\n[248]\tvalid_0's binary_logloss: 0.445652\n[250]\tvalid_0's binary_logloss: 0.445639\n[252]\tvalid_0's binary_logloss: 0.445627\n[254]\tvalid_0's binary_logloss: 0.445616\n[256]\tvalid_0's binary_logloss: 0.445605\n[258]\tvalid_0's binary_logloss: 0.445594\n[260]\tvalid_0's binary_logloss: 0.445584\n[262]\tvalid_0's binary_logloss: 0.445574\n[264]\tvalid_0's binary_logloss: 0.445565\n[266]\tvalid_0's binary_logloss: 
0.445555\n[268]\tvalid_0's binary_logloss: 0.445546\n[270]\tvalid_0's binary_logloss: 0.445537\n[272]\tvalid_0's binary_logloss: 0.445529\n[274]\tvalid_0's binary_logloss: 0.445521\n[276]\tvalid_0's binary_logloss: 0.445513\n[278]\tvalid_0's binary_logloss: 0.445505\n[280]\tvalid_0's binary_logloss: 0.445498\n[282]\tvalid_0's binary_logloss: 0.445491\n[284]\tvalid_0's binary_logloss: 0.445484\n[286]\tvalid_0's binary_logloss: 0.445477\n[288]\tvalid_0's binary_logloss: 0.44547\n[290]\tvalid_0's binary_logloss: 0.445464\n[292]\tvalid_0's binary_logloss: 0.445458\n[294]\tvalid_0's binary_logloss: 0.445452\n[296]\tvalid_0's binary_logloss: 0.445446\n[298]\tvalid_0's binary_logloss: 0.445441\n[300]\tvalid_0's binary_logloss: 0.445435\n[302]\tvalid_0's binary_logloss: 0.44543\n[304]\tvalid_0's binary_logloss: 0.445425\n[306]\tvalid_0's binary_logloss: 0.44542\n[308]\tvalid_0's binary_logloss: 0.445415\n[310]\tvalid_0's binary_logloss: 0.445411\n[312]\tvalid_0's binary_logloss: 0.445406\n[314]\tvalid_0's binary_logloss: 0.445402\n[316]\tvalid_0's binary_logloss: 0.445398\n[318]\tvalid_0's binary_logloss: 0.445394\n[320]\tvalid_0's binary_logloss: 0.44539\n[322]\tvalid_0's binary_logloss: 0.445387\n[324]\tvalid_0's binary_logloss: 0.445383\n[326]\tvalid_0's binary_logloss: 0.445379\n[328]\tvalid_0's binary_logloss: 0.445376\n[330]\tvalid_0's binary_logloss: 0.445373\n[332]\tvalid_0's binary_logloss: 0.44537\n[334]\tvalid_0's binary_logloss: 0.445366\n[336]\tvalid_0's binary_logloss: 0.445363\n[338]\tvalid_0's binary_logloss: 0.44536\n[340]\tvalid_0's binary_logloss: 0.445357\n[342]\tvalid_0's binary_logloss: 0.445354\n[344]\tvalid_0's binary_logloss: 0.445351\n[346]\tvalid_0's binary_logloss: 0.445349\n[348]\tvalid_0's binary_logloss: 0.445346\n[350]\tvalid_0's binary_logloss: 0.445343\n[352]\tvalid_0's binary_logloss: 0.44534\n[354]\tvalid_0's binary_logloss: 0.445338\n[356]\tvalid_0's binary_logloss: 0.445336\n[358]\tvalid_0's binary_logloss: 0.445333\n[360]\tvalid_0's 
binary_logloss: 0.445331\n[362]\tvalid_0's binary_logloss: 0.445329\n[364]\tvalid_0's binary_logloss: 0.445327\n[366]\tvalid_0's binary_logloss: 0.445325\n[368]\tvalid_0's binary_logloss: 0.445323\n[370]\tvalid_0's binary_logloss: 0.445321\n[372]\tvalid_0's binary_logloss: 0.445319\n[374]\tvalid_0's binary_logloss: 0.445317\n[376]\tvalid_0's binary_logloss: 0.445315\n[378]\tvalid_0's binary_logloss: 0.445313\n[380]\tvalid_0's binary_logloss: 0.445311\n[382]\tvalid_0's binary_logloss: 0.44531\n[384]\tvalid_0's binary_logloss: 0.445308\n[386]\tvalid_0's binary_logloss: 0.445307\n[388]\tvalid_0's binary_logloss: 0.445305\n[390]\tvalid_0's binary_logloss: 0.445303\n[392]\tvalid_0's binary_logloss: 0.445302\n[394]\tvalid_0's binary_logloss: 0.4453\n[396]\tvalid_0's binary_logloss: 0.445299\n[398]\tvalid_0's binary_logloss: 0.445297\n[400]\tvalid_0's binary_logloss: 0.445296\n","name":"stdout"}],"source":"import pandas as pd\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time\r\nimport sys\r\nimport datetime\r\nimport gc\r\nfrom sklearn.model_selection import StratifiedShuffleSplit\r\nfrom sklearn.model_selection import KFold, cross_val_score, train_test_split\r\nfrom sklearn.model_selection import StratifiedKFold\r\nfrom sklearn.metrics import roc_auc_score, log_loss\r\nimport lightgbm as lgb\r\nfrom sklearn.preprocessing import OneHotEncoder, LabelEncoder\r\nfrom sklearn.feature_extraction.text import CountVectorizer\r\nfrom sklearn.feature_selection import chi2, SelectPercentile\r\nimport math\r\nfrom sklearn.metrics import f1_score\r\nimport jieba\r\nimport jieba.posseg as psg\r\nfrom collections import Counter\r\nimport functools\r\nfrom time import time\r\nfrom sklearn import preprocessing\r\n\r\nimport pandas as pd\r\nfrom scipy import sparse\r\nfrom sklearn.model_selection import train_test_split, StratifiedKFold\r\nimport lightgbm as lgb\r\nfrom sklearn import metrics\r\nimport os, time, datetime\r\nimport numpy as np\r\nfrom sklearn import 
preprocessing\r\n\r\n\r\n\r\nfea=list(all_blending_train.columns)\r\nprint(fea)\r\ndata_split=StratifiedShuffleSplit(n_splits=2,test_size=0.05,random_state=666)\r\ntrain_index,vaild_index=data_split.split(label,label).__next__()\r\n\r\nlgb_train = lgb.Dataset(all_blending_train[fea], label)\r\nlgb_eval = lgb.Dataset(all_blending_train[fea].iloc[vaild_index], label.iloc[vaild_index], reference=lgb_train)\r\n# lgb_train = lgb.Dataset(all_blending_train[fea].iloc[train_index], label.iloc[train_index])\r\n# lgb_eval = lgb.Dataset(all_blending_train[fea].iloc[vaild_index], label.iloc[vaild_index], reference=lgb_train)\r\n\r\n\r\nparams = {\r\n        'boosting_type':'gbdt', 'num_leaves':63, 'max_depth':-1, 'n_estimators':1200, 'objective':'binary',\r\n        'subsample':0.8, 'colsample_bytree':0.8, 'subsample_freq':1,'num_boost_round':400,\r\n        'learning_rate':0.01, 'random_state':666, 'silent':False,'verbose':1,\r\n        'reg_alpha':0.0,'reg_lambda':1,'feature_fraction':0.8,'min_child_weight':50\r\n}\r\n\r\n\r\nprint(\"start2\")\r\ngbm = lgb.train(params,\r\n                lgb_train,\r\n                valid_sets=lgb_eval,\r\n                verbose_eval=2,\r\n                #  init_model='/home/kesci/work/first_zzp/model/no_title_newn_norm.txt'\r\n                )\r\n\r\n","execution_count":4},{"metadata":{"id":"F104AAF7C411445B99315A204112D967","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"stream","text":"esim : 4467\nlcnn : 2947\ncnn : 3932\ntfidf : 4250\nsiamese : 3791\nrnn : 5413\n","name":"stdout"}],"source":"for index,i in enumerate(fea):\r\n    print(i,\":\",gbm.feature_importance()[index])","execution_count":5},{"metadata":{"id":"2133FED7461742BB816805A7BC4BD481","collapsed":true,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"                 0\n0         0.210311\n1         0.185244\n2         0.205428\n3         0.168797\n4         0.195373\n5 
        0.203070\n6         0.206880\n7         0.210982\n8         0.209267\n9         0.171842\n10        0.230358\n11        0.245966\n12        0.177263\n13        0.212128\n14        0.190028\n15        0.165724\n16        0.196991\n17        0.228026\n18        0.226417\n19        0.169274\n20        0.256199\n21        0.245136\n22        0.167208\n23        0.207261\n24        0.207984\n25        0.151647\n26        0.261249\n27        0.161612\n28        0.036005\n29        0.181454\n...            ...\n99999970  0.206731\n99999971  0.144870\n99999972  0.091068\n99999973  0.233424\n99999974  0.149816\n99999975  0.128108\n99999976  0.113215\n99999977  0.306395\n99999978  0.203325\n99999979  0.202268\n99999980  0.213253\n99999981  0.143598\n99999982  0.190008\n99999983  0.230071\n99999984  0.146531\n99999985  0.158974\n99999986  0.055526\n99999987  0.171059\n99999988  0.045159\n99999989  0.059950\n99999990  0.153400\n99999991  0.098062\n99999992  0.095447\n99999993  0.077830\n99999994  0.230271\n99999995  0.187343\n99999996  0.091649\n99999997  0.063521\n99999998  0.021599\n99999999  0.069386\n\n[100000000 rows x 1 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.210311</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.185244</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.205428</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.168797</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.195373</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>0.203070</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      
<td>0.206880</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>0.210982</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>0.209267</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>0.171842</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>0.230358</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>0.245966</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>0.177263</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>0.212128</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>0.190028</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>0.165724</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>0.196991</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>0.228026</td>\n    </tr>\n    <tr>\n      <th>18</th>\n      <td>0.226417</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>0.169274</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>0.256199</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>0.245136</td>\n    </tr>\n    <tr>\n      <th>22</th>\n      <td>0.167208</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>0.207261</td>\n    </tr>\n    <tr>\n      <th>24</th>\n      <td>0.207984</td>\n    </tr>\n    <tr>\n      <th>25</th>\n      <td>0.151647</td>\n    </tr>\n    <tr>\n      <th>26</th>\n      <td>0.261249</td>\n    </tr>\n    <tr>\n      <th>27</th>\n      <td>0.161612</td>\n    </tr>\n    <tr>\n      <th>28</th>\n      <td>0.036005</td>\n    </tr>\n    <tr>\n      <th>29</th>\n      <td>0.181454</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>99999970</th>\n      <td>0.206731</td>\n    </tr>\n    <tr>\n      <th>99999971</th>\n      <td>0.144870</td>\n    </tr>\n    <tr>\n      <th>99999972</th>\n      <td>0.091068</td>\n    </tr>\n    <tr>\n      <th>99999973</th>\n      <td>0.233424</td>\n    </tr>\n    <tr>\n      <th>99999974</th>\n      <td>0.149816</td>\n    </tr>\n    <tr>\n      <th>99999975</th>\n      <td>0.128108</td>\n  
  </tr>\n    <tr>\n      <th>99999976</th>\n      <td>0.113215</td>\n    </tr>\n    <tr>\n      <th>99999977</th>\n      <td>0.306395</td>\n    </tr>\n    <tr>\n      <th>99999978</th>\n      <td>0.203325</td>\n    </tr>\n    <tr>\n      <th>99999979</th>\n      <td>0.202268</td>\n    </tr>\n    <tr>\n      <th>99999980</th>\n      <td>0.213253</td>\n    </tr>\n    <tr>\n      <th>99999981</th>\n      <td>0.143598</td>\n    </tr>\n    <tr>\n      <th>99999982</th>\n      <td>0.190008</td>\n    </tr>\n    <tr>\n      <th>99999983</th>\n      <td>0.230071</td>\n    </tr>\n    <tr>\n      <th>99999984</th>\n      <td>0.146531</td>\n    </tr>\n    <tr>\n      <th>99999985</th>\n      <td>0.158974</td>\n    </tr>\n    <tr>\n      <th>99999986</th>\n      <td>0.055526</td>\n    </tr>\n    <tr>\n      <th>99999987</th>\n      <td>0.171059</td>\n    </tr>\n    <tr>\n      <th>99999988</th>\n      <td>0.045159</td>\n    </tr>\n    <tr>\n      <th>99999989</th>\n      <td>0.059950</td>\n    </tr>\n    <tr>\n      <th>99999990</th>\n      <td>0.153400</td>\n    </tr>\n    <tr>\n      <th>99999991</th>\n      <td>0.098062</td>\n    </tr>\n    <tr>\n      <th>99999992</th>\n      <td>0.095447</td>\n    </tr>\n    <tr>\n      <th>99999993</th>\n      <td>0.077830</td>\n    </tr>\n    <tr>\n      <th>99999994</th>\n      <td>0.230271</td>\n    </tr>\n    <tr>\n      <th>99999995</th>\n      <td>0.187343</td>\n    </tr>\n    <tr>\n      <th>99999996</th>\n      <td>0.091649</td>\n    </tr>\n    <tr>\n      <th>99999997</th>\n      <td>0.063521</td>\n    </tr>\n    <tr>\n      <th>99999998</th>\n      <td>0.021599</td>\n    </tr>\n    <tr>\n      <th>99999999</th>\n      <td>0.069386</td>\n    </tr>\n  </tbody>\n</table>\n<p>100000000 rows × 1 columns</p>\n</div>"},"execution_count":6}],"source":"final_test=pd.read_pickle(\"/home/kesci/test_final_result/all_chunwenben_final.pickle\")\ny_pred = gbm.predict(final_test[fea], 
num_iteration=gbm.best_iteration)\ny_pred_df=pd.DataFrame(y_pred)\ny_pred_df","execution_count":6},{"metadata":{"id":"66FF23996E584AAFA161B93248C1FBFF","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"0    0.188934\ndtype: float64"},"execution_count":8}],"source":"y_pred_df.mean()","execution_count":8},{"metadata":{"id":"230ADE0E67B748F7A460F41898124D6A","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"y_pred_df.to_pickle(\"/home/kesci/test_final_result/pro/chunwenben_lgb.pickle\")","execution_count":7},{"metadata":{"id":"57AA30F11D9447B08D511724D9B53917","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"import pandas as pd\nesim_test=pd.read_pickle(\"/home/kesci/test_old_result/esim_fea_chunwenben.pickle\")\nlcnn_test=pd.read_pickle(\"/home/kesci/test_old_result/lcnn_fea_chunwenben.pickle\")\ncnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_fea_chunwenben.pickle\")\ntfidf_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_tfidf_fea_chunwenben.pickle\")\nsiamese_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_fea_chunwenben.pickle\")\nrnn_test=pd.read_pickle(\"/home/kesci/test_old_result/textcnn_rnn_fea_chunwenben.pickle\")\n# sia_have_ctr_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_ln230000.pickle\")\nsia_no_ctr_test=pd.read_pickle(\"/home/kesci/test_old_result/siamese_fea_no_ctr.pickle\")\n# 
lgb_test=pd.read_pickle(\"/home/kesci/test_old_result/lgb_1200.pickle\")","execution_count":11},{"metadata":{"id":"740D233CBDC742F48618E0C21F012FAA","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"all_blending_test=pd.concat([esim_test,lcnn_test,cnn_test,tfidf_test,siamese_test,rnn_test,sia_no_ctr_test],axis=1)","execution_count":12},{"metadata":{"id":"B3182134CF7C44E2B6D72732D1E1C41E","collapsed":true,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"              esim      lcnn       cnn     tfidf   siamese       rnn  \\\n0         0.121969  0.141916  0.073899  0.111606  0.154982  0.140959   \n1         0.117944  0.098463  0.132341  0.148480  0.094191  0.129137   \n2         0.147725  0.109690  0.165520  0.159867  0.147043  0.197441   \n3         0.092843  0.066641  0.077392  0.082447  0.093291  0.101332   \n4         0.094665  0.066214  0.085764  0.119898  0.085289  0.109217   \n5         0.047111  0.065137  0.105278  0.110394  0.105845  0.079645   \n6         0.147479  0.073634  0.098026  0.116179  0.084572  0.102837   \n7         0.288006  0.188871  0.302645  0.230800  0.210540  0.233643   \n8         0.173228  0.171690  0.205335  0.167620  0.193849  0.228315   \n9         0.165605  0.153243  0.225469  0.171734  0.190251  0.145583   \n10        0.170893  0.154676  0.253216  0.177713  0.237778  0.105086   \n11        0.239466  0.268798  0.247395  0.242179  0.203153  0.204017   \n12        0.279704  0.293246  0.272070  0.272642  0.238882  0.198520   \n13        0.212291  0.212762  0.304093  0.230825  0.249955  0.240629   \n14        0.300040  0.304623  0.311407  0.362721  0.313988  0.264392   \n15        0.220718  0.265456  0.266225  0.265467  0.272285  0.238509   \n16        0.400686  0.272483  0.295897  0.345037  0.272594  0.265753   \n17        0.147171  0.207351  0.128311  0.207675  0.198607  0.161907   \n18        0.223619  0.242515  0.224425  0.280870  0.217178  
0.195734   \n19        0.260673  0.246709  0.230789  0.313580  0.299818  0.289870   \n20        0.325412  0.158417  0.190137  0.099717  0.132595  0.085175   \n21        0.079454  0.091677  0.093115  0.040173  0.045413  0.049665   \n22        0.180176  0.199735  0.250877  0.193992  0.241155  0.185504   \n23        0.187539  0.174926  0.171908  0.225649  0.117927  0.246260   \n24        0.274071  0.281262  0.327212  0.308681  0.240054  0.245932   \n25        0.440706  0.357532  0.489452  0.445731  0.341629  0.346563   \n26        0.309274  0.242823  0.280152  0.378054  0.199798  0.248842   \n27        0.265754  0.255706  0.302223  0.311288  0.234354  0.233913   \n28        0.298531  0.226049  0.149166  0.229260  0.141381  0.160283   \n29        0.216490  0.197633  0.143824  0.203847  0.141776  0.153550   \n...            ...       ...       ...       ...       ...       ...   \n19999970  0.266753  0.253343  0.270979  0.245785  0.204786  0.266794   \n19999971  0.204532  0.232399  0.162781  0.222626  0.105559  0.236778   \n19999972  0.172776  0.229148  0.232781  0.272417  0.079179  0.243299   \n19999973  0.190127  0.169947  0.173790  0.147615  0.124066  0.202914   \n19999974  0.184075  0.215539  0.191995  0.245366  0.108124  0.148000   \n19999975  0.201063  0.206217  0.217305  0.211379  0.120300  0.133529   \n19999976  0.263114  0.206249  0.243361  0.230905  0.165667  0.175402   \n19999977  0.207991  0.186775  0.248626  0.231404  0.191884  0.212956   \n19999978  0.232222  0.192657  0.286255  0.251159  0.228194  0.146540   \n19999979  0.168018  0.133309  0.199191  0.215138  0.174466  0.212487   \n19999980  0.155482  0.173693  0.185505  0.140854  0.159753  0.157590   \n19999981  0.150001  0.153460  0.167973  0.141366  0.131357  0.056574   \n19999982  0.153335  0.253157  0.371184  0.307918  0.192907  0.222712   \n19999983  0.149908  0.125719  0.221791  0.193558  0.104223  0.128290   \n19999984  0.200328  0.246001  0.229898  0.375939  0.215279  0.126337   \n19999985  
0.162396  0.139653  0.171453  0.233186  0.104020  0.105896   \n19999986  0.235967  0.192605  0.319645  0.312870  0.198890  0.232433   \n19999987  0.124185  0.141415  0.161588  0.190638  0.118735  0.097540   \n19999988  0.197837  0.199879  0.223809  0.221159  0.125684  0.147585   \n19999989  0.205005  0.186142  0.226290  0.245012  0.197582  0.141685   \n19999990  0.183527  0.247752  0.281995  0.321252  0.265917  0.192180   \n19999991  0.111454  0.112973  0.155026  0.143570  0.068944  0.083836   \n19999992  0.093610  0.154225  0.164779  0.129170  0.105623  0.158423   \n19999993  0.088829  0.153713  0.143548  0.098908  0.079452  0.153971   \n19999994  0.405008  0.225148  0.323865  0.244468  0.224904  0.310106   \n19999995  0.366662  0.236842  0.353278  0.247827  0.218987  0.291947   \n19999996  0.255505  0.160392  0.239725  0.139323  0.076809  0.205082   \n19999997  0.156598  0.167742  0.248009  0.258574  0.018874  0.233181   \n19999998  0.229248  0.330431  0.295004  0.283054  0.029620  0.189635   \n19999999  0.312323  0.224482  0.264713  0.332426  0.023066  0.336843   \n\n          sia_no_ctr  \n0           0.166948  \n1           0.170251  \n2           0.196665  \n3           0.081469  \n4           0.153792  \n5           0.114921  \n6           0.123355  \n7           0.265786  \n8           0.188117  \n9           0.207271  \n10          0.177373  \n11          0.267721  \n12          0.186925  \n13          0.119026  \n14          0.362533  \n15          0.273264  \n16          0.313454  \n17          0.269053  \n18          0.384805  \n19          0.558585  \n20          0.065679  \n21          0.076672  \n22          0.127416  \n23          0.106278  \n24          0.230748  \n25          0.377482  \n26          0.277853  \n27          0.252001  \n28          0.151244  \n29          0.121095  \n...              ...  
\n19999970    0.125690  \n19999971    0.066205  \n19999972    0.180717  \n19999973    0.047037  \n19999974    0.135291  \n19999975    0.119224  \n19999976    0.186967  \n19999977    0.167095  \n19999978    0.075292  \n19999979    0.123571  \n19999980    0.112732  \n19999981    0.113430  \n19999982    0.213623  \n19999983    0.131751  \n19999984    0.211075  \n19999985    0.083954  \n19999986    0.146514  \n19999987    0.090615  \n19999988    0.159694  \n19999989    0.192091  \n19999990    0.242633  \n19999991    0.116604  \n19999992    0.151878  \n19999993    0.113973  \n19999994    0.319637  \n19999995    0.308555  \n19999996    0.159968  \n19999997    0.134028  \n19999998    0.169663  \n19999999    0.113268  \n\n[20000000 rows x 7 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>esim</th>\n      <th>lcnn</th>\n      <th>cnn</th>\n      <th>tfidf</th>\n      <th>siamese</th>\n      <th>rnn</th>\n      <th>sia_no_ctr</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.121969</td>\n      <td>0.141916</td>\n      <td>0.073899</td>\n      <td>0.111606</td>\n      <td>0.154982</td>\n      <td>0.140959</td>\n      <td>0.166948</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.117944</td>\n      <td>0.098463</td>\n      <td>0.132341</td>\n      <td>0.148480</td>\n      <td>0.094191</td>\n      <td>0.129137</td>\n      <td>0.170251</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.147725</td>\n      <td>0.109690</td>\n      <td>0.165520</td>\n      <td>0.159867</td>\n      <td>0.147043</td>\n      <td>0.197441</td>\n      <td>0.196665</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      
<td>0.092843</td>\n      <td>0.066641</td>\n      <td>0.077392</td>\n      <td>0.082447</td>\n      <td>0.093291</td>\n      <td>0.101332</td>\n      <td>0.081469</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.094665</td>\n      <td>0.066214</td>\n      <td>0.085764</td>\n      <td>0.119898</td>\n      <td>0.085289</td>\n      <td>0.109217</td>\n      <td>0.153792</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>0.047111</td>\n      <td>0.065137</td>\n      <td>0.105278</td>\n      <td>0.110394</td>\n      <td>0.105845</td>\n      <td>0.079645</td>\n      <td>0.114921</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>0.147479</td>\n      <td>0.073634</td>\n      <td>0.098026</td>\n      <td>0.116179</td>\n      <td>0.084572</td>\n      <td>0.102837</td>\n      <td>0.123355</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>0.288006</td>\n      <td>0.188871</td>\n      <td>0.302645</td>\n      <td>0.230800</td>\n      <td>0.210540</td>\n      <td>0.233643</td>\n      <td>0.265786</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>0.173228</td>\n      <td>0.171690</td>\n      <td>0.205335</td>\n      <td>0.167620</td>\n      <td>0.193849</td>\n      <td>0.228315</td>\n      <td>0.188117</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>0.165605</td>\n      <td>0.153243</td>\n      <td>0.225469</td>\n      <td>0.171734</td>\n      <td>0.190251</td>\n      <td>0.145583</td>\n      <td>0.207271</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>0.170893</td>\n      <td>0.154676</td>\n      <td>0.253216</td>\n      <td>0.177713</td>\n      <td>0.237778</td>\n      <td>0.105086</td>\n      <td>0.177373</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>0.239466</td>\n      <td>0.268798</td>\n      <td>0.247395</td>\n      <td>0.242179</td>\n      <td>0.203153</td>\n      <td>0.204017</td>\n      <td>0.267721</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>0.279704</td>\n      <td>0.293246</td>\n      <td>0.272070</td>\n  
    <td>0.272642</td>\n      <td>0.238882</td>\n      <td>0.198520</td>\n      <td>0.186925</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>0.212291</td>\n      <td>0.212762</td>\n      <td>0.304093</td>\n      <td>0.230825</td>\n      <td>0.249955</td>\n      <td>0.240629</td>\n      <td>0.119026</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>0.300040</td>\n      <td>0.304623</td>\n      <td>0.311407</td>\n      <td>0.362721</td>\n      <td>0.313988</td>\n      <td>0.264392</td>\n      <td>0.362533</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>0.220718</td>\n      <td>0.265456</td>\n      <td>0.266225</td>\n      <td>0.265467</td>\n      <td>0.272285</td>\n      <td>0.238509</td>\n      <td>0.273264</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>0.400686</td>\n      <td>0.272483</td>\n      <td>0.295897</td>\n      <td>0.345037</td>\n      <td>0.272594</td>\n      <td>0.265753</td>\n      <td>0.313454</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>0.147171</td>\n      <td>0.207351</td>\n      <td>0.128311</td>\n      <td>0.207675</td>\n      <td>0.198607</td>\n      <td>0.161907</td>\n      <td>0.269053</td>\n    </tr>\n    <tr>\n      <th>18</th>\n      <td>0.223619</td>\n      <td>0.242515</td>\n      <td>0.224425</td>\n      <td>0.280870</td>\n      <td>0.217178</td>\n      <td>0.195734</td>\n      <td>0.384805</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>0.260673</td>\n      <td>0.246709</td>\n      <td>0.230789</td>\n      <td>0.313580</td>\n      <td>0.299818</td>\n      <td>0.289870</td>\n      <td>0.558585</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>0.325412</td>\n      <td>0.158417</td>\n      <td>0.190137</td>\n      <td>0.099717</td>\n      <td>0.132595</td>\n      <td>0.085175</td>\n      <td>0.065679</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>0.079454</td>\n      <td>0.091677</td>\n      <td>0.093115</td>\n      <td>0.040173</td>\n      <td>0.045413</td>\n      
<td>0.049665</td>\n      <td>0.076672</td>\n    </tr>\n    <tr>\n      <th>22</th>\n      <td>0.180176</td>\n      <td>0.199735</td>\n      <td>0.250877</td>\n      <td>0.193992</td>\n      <td>0.241155</td>\n      <td>0.185504</td>\n      <td>0.127416</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>0.187539</td>\n      <td>0.174926</td>\n      <td>0.171908</td>\n      <td>0.225649</td>\n      <td>0.117927</td>\n      <td>0.246260</td>\n      <td>0.106278</td>\n    </tr>\n    <tr>\n      <th>24</th>\n      <td>0.274071</td>\n      <td>0.281262</td>\n      <td>0.327212</td>\n      <td>0.308681</td>\n      <td>0.240054</td>\n      <td>0.245932</td>\n      <td>0.230748</td>\n    </tr>\n    <tr>\n      <th>25</th>\n      <td>0.440706</td>\n      <td>0.357532</td>\n      <td>0.489452</td>\n      <td>0.445731</td>\n      <td>0.341629</td>\n      <td>0.346563</td>\n      <td>0.377482</td>\n    </tr>\n    <tr>\n      <th>26</th>\n      <td>0.309274</td>\n      <td>0.242823</td>\n      <td>0.280152</td>\n      <td>0.378054</td>\n      <td>0.199798</td>\n      <td>0.248842</td>\n      <td>0.277853</td>\n    </tr>\n    <tr>\n      <th>27</th>\n      <td>0.265754</td>\n      <td>0.255706</td>\n      <td>0.302223</td>\n      <td>0.311288</td>\n      <td>0.234354</td>\n      <td>0.233913</td>\n      <td>0.252001</td>\n    </tr>\n    <tr>\n      <th>28</th>\n      <td>0.298531</td>\n      <td>0.226049</td>\n      <td>0.149166</td>\n      <td>0.229260</td>\n      <td>0.141381</td>\n      <td>0.160283</td>\n      <td>0.151244</td>\n    </tr>\n    <tr>\n      <th>29</th>\n      <td>0.216490</td>\n      <td>0.197633</td>\n      <td>0.143824</td>\n      <td>0.203847</td>\n      <td>0.141776</td>\n      <td>0.153550</td>\n      <td>0.121095</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>19999970</th>\n      
<td>0.266753</td>\n      <td>0.253343</td>\n      <td>0.270979</td>\n      <td>0.245785</td>\n      <td>0.204786</td>\n      <td>0.266794</td>\n      <td>0.125690</td>\n    </tr>\n    <tr>\n      <th>19999971</th>\n      <td>0.204532</td>\n      <td>0.232399</td>\n      <td>0.162781</td>\n      <td>0.222626</td>\n      <td>0.105559</td>\n      <td>0.236778</td>\n      <td>0.066205</td>\n    </tr>\n    <tr>\n      <th>19999972</th>\n      <td>0.172776</td>\n      <td>0.229148</td>\n      <td>0.232781</td>\n      <td>0.272417</td>\n      <td>0.079179</td>\n      <td>0.243299</td>\n      <td>0.180717</td>\n    </tr>\n    <tr>\n      <th>19999973</th>\n      <td>0.190127</td>\n      <td>0.169947</td>\n      <td>0.173790</td>\n      <td>0.147615</td>\n      <td>0.124066</td>\n      <td>0.202914</td>\n      <td>0.047037</td>\n    </tr>\n    <tr>\n      <th>19999974</th>\n      <td>0.184075</td>\n      <td>0.215539</td>\n      <td>0.191995</td>\n      <td>0.245366</td>\n      <td>0.108124</td>\n      <td>0.148000</td>\n      <td>0.135291</td>\n    </tr>\n    <tr>\n      <th>19999975</th>\n      <td>0.201063</td>\n      <td>0.206217</td>\n      <td>0.217305</td>\n      <td>0.211379</td>\n      <td>0.120300</td>\n      <td>0.133529</td>\n      <td>0.119224</td>\n    </tr>\n    <tr>\n      <th>19999976</th>\n      <td>0.263114</td>\n      <td>0.206249</td>\n      <td>0.243361</td>\n      <td>0.230905</td>\n      <td>0.165667</td>\n      <td>0.175402</td>\n      <td>0.186967</td>\n    </tr>\n    <tr>\n      <th>19999977</th>\n      <td>0.207991</td>\n      <td>0.186775</td>\n      <td>0.248626</td>\n      <td>0.231404</td>\n      <td>0.191884</td>\n      <td>0.212956</td>\n      <td>0.167095</td>\n    </tr>\n    <tr>\n      <th>19999978</th>\n      <td>0.232222</td>\n      <td>0.192657</td>\n      <td>0.286255</td>\n      <td>0.251159</td>\n      <td>0.228194</td>\n      <td>0.146540</td>\n      <td>0.075292</td>\n    </tr>\n    <tr>\n      <th>19999979</th>\n      
<td>0.168018</td>\n      <td>0.133309</td>\n      <td>0.199191</td>\n      <td>0.215138</td>\n      <td>0.174466</td>\n      <td>0.212487</td>\n      <td>0.123571</td>\n    </tr>\n    <tr>\n      <th>19999980</th>\n      <td>0.155482</td>\n      <td>0.173693</td>\n      <td>0.185505</td>\n      <td>0.140854</td>\n      <td>0.159753</td>\n      <td>0.157590</td>\n      <td>0.112732</td>\n    </tr>\n    <tr>\n      <th>19999981</th>\n      <td>0.150001</td>\n      <td>0.153460</td>\n      <td>0.167973</td>\n      <td>0.141366</td>\n      <td>0.131357</td>\n      <td>0.056574</td>\n      <td>0.113430</td>\n    </tr>\n    <tr>\n      <th>19999982</th>\n      <td>0.153335</td>\n      <td>0.253157</td>\n      <td>0.371184</td>\n      <td>0.307918</td>\n      <td>0.192907</td>\n      <td>0.222712</td>\n      <td>0.213623</td>\n    </tr>\n    <tr>\n      <th>19999983</th>\n      <td>0.149908</td>\n      <td>0.125719</td>\n      <td>0.221791</td>\n      <td>0.193558</td>\n      <td>0.104223</td>\n      <td>0.128290</td>\n      <td>0.131751</td>\n    </tr>\n    <tr>\n      <th>19999984</th>\n      <td>0.200328</td>\n      <td>0.246001</td>\n      <td>0.229898</td>\n      <td>0.375939</td>\n      <td>0.215279</td>\n      <td>0.126337</td>\n      <td>0.211075</td>\n    </tr>\n    <tr>\n      <th>19999985</th>\n      <td>0.162396</td>\n      <td>0.139653</td>\n      <td>0.171453</td>\n      <td>0.233186</td>\n      <td>0.104020</td>\n      <td>0.105896</td>\n      <td>0.083954</td>\n    </tr>\n    <tr>\n      <th>19999986</th>\n      <td>0.235967</td>\n      <td>0.192605</td>\n      <td>0.319645</td>\n      <td>0.312870</td>\n      <td>0.198890</td>\n      <td>0.232433</td>\n      <td>0.146514</td>\n    </tr>\n    <tr>\n      <th>19999987</th>\n      <td>0.124185</td>\n      <td>0.141415</td>\n      <td>0.161588</td>\n      <td>0.190638</td>\n      <td>0.118735</td>\n      <td>0.097540</td>\n      <td>0.090615</td>\n    </tr>\n    <tr>\n      <th>19999988</th>\n      
<td>0.197837</td>\n      <td>0.199879</td>\n      <td>0.223809</td>\n      <td>0.221159</td>\n      <td>0.125684</td>\n      <td>0.147585</td>\n      <td>0.159694</td>\n    </tr>\n    <tr>\n      <th>19999989</th>\n      <td>0.205005</td>\n      <td>0.186142</td>\n      <td>0.226290</td>\n      <td>0.245012</td>\n      <td>0.197582</td>\n      <td>0.141685</td>\n      <td>0.192091</td>\n    </tr>\n    <tr>\n      <th>19999990</th>\n      <td>0.183527</td>\n      <td>0.247752</td>\n      <td>0.281995</td>\n      <td>0.321252</td>\n      <td>0.265917</td>\n      <td>0.192180</td>\n      <td>0.242633</td>\n    </tr>\n    <tr>\n      <th>19999991</th>\n      <td>0.111454</td>\n      <td>0.112973</td>\n      <td>0.155026</td>\n      <td>0.143570</td>\n      <td>0.068944</td>\n      <td>0.083836</td>\n      <td>0.116604</td>\n    </tr>\n    <tr>\n      <th>19999992</th>\n      <td>0.093610</td>\n      <td>0.154225</td>\n      <td>0.164779</td>\n      <td>0.129170</td>\n      <td>0.105623</td>\n      <td>0.158423</td>\n      <td>0.151878</td>\n    </tr>\n    <tr>\n      <th>19999993</th>\n      <td>0.088829</td>\n      <td>0.153713</td>\n      <td>0.143548</td>\n      <td>0.098908</td>\n      <td>0.079452</td>\n      <td>0.153971</td>\n      <td>0.113973</td>\n    </tr>\n    <tr>\n      <th>19999994</th>\n      <td>0.405008</td>\n      <td>0.225148</td>\n      <td>0.323865</td>\n      <td>0.244468</td>\n      <td>0.224904</td>\n      <td>0.310106</td>\n      <td>0.319637</td>\n    </tr>\n    <tr>\n      <th>19999995</th>\n      <td>0.366662</td>\n      <td>0.236842</td>\n      <td>0.353278</td>\n      <td>0.247827</td>\n      <td>0.218987</td>\n      <td>0.291947</td>\n      <td>0.308555</td>\n    </tr>\n    <tr>\n      <th>19999996</th>\n      <td>0.255505</td>\n      <td>0.160392</td>\n      <td>0.239725</td>\n      <td>0.139323</td>\n      <td>0.076809</td>\n      <td>0.205082</td>\n      <td>0.159968</td>\n    </tr>\n    <tr>\n      <th>19999997</th>\n      
<td>0.156598</td>\n      <td>0.167742</td>\n      <td>0.248009</td>\n      <td>0.258574</td>\n      <td>0.018874</td>\n      <td>0.233181</td>\n      <td>0.134028</td>\n    </tr>\n    <tr>\n      <th>19999998</th>\n      <td>0.229248</td>\n      <td>0.330431</td>\n      <td>0.295004</td>\n      <td>0.283054</td>\n      <td>0.029620</td>\n      <td>0.189635</td>\n      <td>0.169663</td>\n    </tr>\n    <tr>\n      <th>19999999</th>\n      <td>0.312323</td>\n      <td>0.224482</td>\n      <td>0.264713</td>\n      <td>0.332426</td>\n      <td>0.023066</td>\n      <td>0.336843</td>\n      <td>0.113268</td>\n    </tr>\n  </tbody>\n</table>\n<p>20000000 rows × 7 columns</p>\n</div>"},"execution_count":13}],"source":"all_blending_test.columns=['esim','lcnn','cnn','tfidf','siamese','rnn','sia_no_ctr']\nall_blending_test","execution_count":13},
{"metadata":{"id":"AF1C9065C48E49B8829A2D0AE8E06AD4","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[],"source":"y_pred = gbm.predict(all_blending_test[fea], num_iteration=gbm.best_iteration)\r\ny_pred_df=pd.DataFrame(y_pred)","execution_count":14},
{"metadata":{"id":"CAC6358E53D24C16B2E251DFBA84ADCA","collapsed":false},"cell_type":"code","outputs":[{"output_type":"execute_result","metadata":{},"data":{"text/plain":"0    0.190766\ndtype: float64"},"execution_count":15}],"source":"preds=y_pred_df\npreds.mean()","execution_count":15},
{"metadata":{"id":"BCE54CB6EA7E4E099C83CF57F8AB77E3","collapsed":false,"scrolled":true},"cell_type":"code","outputs":[{"output_type":"stream","text":"0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n10.710400342941284\n0.00017881393432617188\n","name":"stdout"}],"source":"import multiprocessing\r\nfrom time import time\r\n\r\nimport pandas as pd\r\n\r\n# Read only the two id columns of the test file in parallel, then attach the\r\n# blended predictions to build the submission file.\r\ndata_r_path='/home/kesci/input/bytedance/test_final_part1.csv'\r\ncpu_num=16\r\nstart=0\r\nall_data_num=20000000\r\nnames_out=['query_id','query','query_title_id','title']\r\nusecols_out=['query_id','query_title_id']\r\nhas_head=False\r\n\r\n\r\ndef data_read(start,single_data_num,data_real_path):\r\n    \"\"\"Read single_data_num rows of data_real_path after skipping the first start lines.\"\"\"\r\n    return pd.read_csv(data_real_path,header=None,names=names_out,usecols=usecols_out,skiprows=start,nrows=single_data_num)\r\n\r\n\r\n# When the file has a header line, skip it too so it is never parsed as data.\r\nfirst_row=start+(1 if has_head else 0)\r\n# Ceiling division: the last chunk takes the remainder, so no trailing rows are\r\n# lost when all_data_num is not a multiple of cpu_num.\r\nsingle_data_num=max(1,-(-all_data_num//cpu_num))\r\n\r\ntime1=time()\r\npool = multiprocessing.Pool(processes=cpu_num)\r\nall_data=[]\r\nfor epoch,offset in enumerate(range(0,all_data_num,single_data_num)):\r\n    print(epoch)\r\n    rows=min(single_data_num,all_data_num-offset)\r\n    all_data.append(pool.apply_async(data_read, [first_row+offset,rows,data_r_path]))\r\npool.close()\r\npool.join()\r\ntime2=time()\r\nprint(time2-time1)\r\nchunks=[single.get() for single in all_data]\r\ntime3=time()\r\nprint(time3-time2)\r\nall_data_pro=pd.concat(chunks)\r\ntest_data_pred_need=all_data_pro.reset_index(drop=True)\r\n\r\n# pd.concat(axis=1) aligns on the index and pads mismatches with NaN, so verify\r\n# that ids and predictions have the same length before writing the file.\r\nassert len(test_data_pred_need)==len(preds), (len(test_data_pred_need),len(preds))\r\nresult=pd.concat([test_data_pred_need,preds],axis=1)\r\n# Three columns (query_id, query_title_id, prediction) form the final submission.\r\nresult.to_csv(\"first_zzp/result/sub_logestic_nn_quanbuchunwenben.csv\",header=False,index=False)","execution_count":null},
{"metadata":{"id":"5F65E8142EA748119FC14FBB8508DFAE","collapsed":false,"scrolled":false},"cell_type":"code","outputs":[{"output_type":"stream","text":"wget: /opt/conda/lib/libcrypto.so.1.0.0: no version information available (required by wget)\nwget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)\nwget: /opt/conda/lib/libssl.so.1.0.0: no version information available (required by wget)\n--2019-08-10 18:30:37--  https://www.heywhale.com/kesci_submit\nResolving www.heywhale.com (www.heywhale.com)... 106.15.25.147\nConnecting to www.heywhale.com (www.heywhale.com)|106.15.25.147|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 6709558 (6.4M) [application/octet-stream]\nSaving to: ‘kesci_submit’\n\nkesci_submit        100%[===================>]   6.40M  21.1MB/s    in 0.3s    \n\n2019-08-10 18:30:38 (21.1 MB/s) - ‘kesci_submit’ saved [6709558/6709558]\n\nKesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 /home/kesci/work/first_zzp/result/sub_logestic_nn_quanbuchunwenben_lgb_nnln.csv (575822.62 KiB)\n> 已上传 100 %\n> 文件已上传        \n> 服务器响应: 200 提交成功，请等待评审完成\n> 提交完成\n","name":"stdout"}],"source":"# The submit token is read from the KESCI_TOKEN environment variable so the\r\n# credential is not stored in the notebook.\r\n# NOTE(review): this uploads sub_logestic_nn_quanbuchunwenben_lgb_nnln.csv, while the\r\n# previous cell writes sub_logestic_nn_quanbuchunwenben.csv; confirm the intended file.\r\n!wget -O kesci_submit https://www.heywhale.com/kesci_submit&&chmod +x kesci_submit\r\n!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file /home/kesci/work/first_zzp/result/sub_logestic_nn_quanbuchunwenben_lgb_nnln.csv -token \"$KESCI_TOKEN\"","execution_count":32},
{"metadata":{"id":"67C0AF9D2CB24D7B9E22D8E0215F2392"},"cell_type":"code","outputs":[],"source":"","execution_count":null}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.6.4","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":0}